In [38]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
In [39]:
# Load the sample dataset. The path is relative, so Data.csv has to be in the
# notebook's working directory.
df = pd.read_csv("Data.csv")
In [40]:
# Show the whole frame to see where the missing values (NaN) are.
df
Out[40]:
Country Age Salary Purchased
0 France 44.0 72000.0 Yes
1 Spain 27.0 48000.0 Yes
2 NaN 30.0 54000.0 NaN
3 Spain 38.0 61000.0 No
4 Germany 40.0 NaN Yes
5 France 35.0 58000.0 Yes
6 Spain NaN 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes

1. Removing the Rows¶

In [41]:
# Preview the frame with every row that contains at least one NaN removed.
# dropna() returns a new frame and the result is not assigned here, so df
# itself keeps all of its rows.
df.dropna()
Out[41]:
Country Age Salary Purchased
0 France 44.0 72000.0 Yes
1 Spain 27.0 48000.0 Yes
3 Spain 38.0 61000.0 No
5 France 35.0 58000.0 Yes
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes

2. Imputers¶

In [42]:
# Fill the gaps in columns 1-2 (Age and Salary) with each column's most
# frequent value. The other columns are not passed to the imputer, so their
# NaN entries are left as they are.
#
# NOTE: in this dataset every Age and every Salary occurs exactly once, so
# 'most_frequent' resolves the tie by taking the smallest value -- the fills
# end up being the column minimum (27.0 and 48000.0).
# For continuous columns the usual alternative is strategy='mean':
#   SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

# fit_transform learns the per-column fill values and applies them in a single
# call; the imputer is left fitted, exactly as after a separate fit().
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3].values)

df
Out[42]:
Country Age Salary Purchased
0 France 44.0 72000.0 Yes
1 Spain 27.0 48000.0 Yes
2 NaN 30.0 54000.0 NaN
3 Spain 38.0 61000.0 No
4 Germany 40.0 48000.0 Yes
5 France 35.0 58000.0 Yes
6 Spain 27.0 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [43]:
# Fit the imputer on columns 1-2 (Age and Salary) as a NumPy array.
# NOTE(review): the cell above already fitted this imputer and filled these
# columns, so this cell and the two after it look like a step-by-step repeat
# of that work -- confirm whether both versions are meant to stay.
imputer.fit(df.iloc[:,1:3].values)
Out[43]:
SimpleImputer(strategy='most_frequent')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SimpleImputer(strategy='most_frequent')
In [44]:
# Apply the fitted imputer to columns 1-2 and keep the filled values in x.
x = imputer.transform(df.iloc[:,1:3].values)
In [45]:
# Write the imputed values back into columns 1-2 of df (this modifies df).
df.iloc[:,1:3]=x
In [46]:
# Display df after imputation. Only columns 1-2 were imputed, so NaN values in
# the other columns are still present.
df
Out[46]:
Country Age Salary Purchased
0 France 44.0 72000.0 Yes
1 Spain 27.0 48000.0 Yes
2 NaN 30.0 54000.0 NaN
3 Spain 38.0 61000.0 No
4 Germany 40.0 48000.0 Yes
5 France 35.0 58000.0 Yes
6 Spain 27.0 52000.0 No
7 France 48.0 79000.0 Yes
8 Germany 50.0 83000.0 No
9 France 37.0 67000.0 Yes
In [ ]: